# install.packages("sf")
# install.packages("leaflet")
library(sf)
library(tidyverse)
library(leaflet)
I have been provided these three datasets for this project:
# Load the burglary incident records from burglaries_2023.csv.
burglary_incidents <- readr::read_csv(file = "../data/burglaries_2023.csv")
# Display the table for a first look at its columns.
burglary_incidents
NA
Take a closer look at the values in the victim ethnicity column:
# List the distinct values recorded in the victim_ethnicity column.
burglary_incidents |>
  pull(victim_ethnicity) |>
  unique()
[1] "Non-Hispanic" NA "Hispanic" "Unknown"
# Load the census attribute table from census.csv.
census <- readr::read_csv(file = "../data/census.csv")
# Display the table for a first look at its columns.
census
NA
# Load the Davidson County census tract geometries from the DC shapefile.
dav_cty_census_tracts <- sf::read_sf(dsn = "../data/DC/DC.shp")
# Display the sf object (fields, geometry type, and CRS).
dav_cty_census_tracts
Simple feature collection with 174 features and 12 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -87.0547 ymin: 35.96778 xmax: -86.51559 ymax: 36.4055
Geodetic CRS: NAD83
# Re-display the incident table for reference.
burglary_incidents
# Draw the census tract outlines.
ggplot(data = dav_cty_census_tracts) +
  geom_sf()
# Draw the tracts again, shading each one by its ALAND value.
ggplot(data = dav_cty_census_tracts) +
  geom_sf(mapping = aes(fill = ALAND))
Perform a spatial join to determine the census tract in which each burglary occurred. Hint: You may want to make use of the st_as_sf function in order to convert the burglaries data into an sf object.
# Show the raw incident table before converting it.
burglary_incidents
# Convert the incidents into sf point features. Rows missing either
# coordinate are dropped first. The points are labelled with the CRS of the
# tract layer; the coordinates themselves are not reprojected.
# NOTE(review): assumes longitude/latitude are already expressed in that
# CRS -- confirm against the data source.
burglary_incidents_mapped <- burglary_incidents |>
  drop_na(latitude, longitude) |>
  st_as_sf(
    coords = c("longitude", "latitude"),
    crs = st_crs(dav_cty_census_tracts)
  )
burglary_incidents_mapped
Simple feature collection with 1146 features and 27 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -92.51 ymin: 34.15 xmax: -86.557 ymax: 36.34
Geodetic CRS: NAD83
Rename column in Davidson county census tract data so that the merge goes more smoothly.
# Rename NAME to tract_name so it does not clash with the census table's
# own NAME column when the two are merged.
dav_cty_census_tracts <- dav_cty_census_tracts |>
  rename(tract_name = NAME)
Merge census csv data with dav_cty_census_tracts DC shape file data.
# Attach the census attributes to the tract geometries, matching TRACTCE in
# the shapefile to tract in the CSV. all = TRUE keeps unmatched rows from
# both tables.
census_tracts <- merge(
  x = dav_cty_census_tracts,
  y = census,
  by.x = "TRACTCE",
  by.y = "tract",
  all = TRUE
)
census_tracts
Simple feature collection with 174 features and 17 fields
Geometry type: MULTIPOLYGON
Dimension: XY
Bounding box: xmin: -87.0547 ymin: 35.96778 xmax: -86.51559 ymax: 36.4055
Geodetic CRS: NAD83
First 10 features:
TRACTCE STATEFP COUNTYFP GEOID tract_name NAMELSAD MTFCC FUNCSTAT ALAND AWATER
1 010103 47 037 47037010103 101.03 Census Tract 101.03 G5020 S 48034082 61097
2 010104 47 037 47037010104 101.04 Census Tract 101.04 G5020 S 65057849 251504
3 010105 47 037 47037010105 101.05 Census Tract 101.05 G5020 S 28328799 1093
4 010106 47 037 47037010106 101.06 Census Tract 101.06 G5020 S 21616474 6845
5 010201 47 037 47037010201 102.01 Census Tract 102.01 G5020 S 23718545 0
6 010202 47 037 47037010202 102.02 Census Tract 102.02 G5020 S 68394934 77571
7 010301 47 037 47037010301 103.01 Census Tract 103.01 G5020 S 8527942 11775
8 010302 47 037 47037010302 103.02 Census Tract 103.02 G5020 S 4179336 6813
9 010303 47 037 47037010303 103.03 Census Tract 103.03 G5020 S 4508896 142888
10 010401 47 037 47037010401 104.01 Census Tract 104.01 G5020 S 9543414 320298
INTPTLAT INTPTLON NAME state county population
1 +36.3444054 -086.8608396 Census Tract 101.03, Davidson County, Tennessee 47 037 2411
2 +36.2940028 -086.8777483 Census Tract 101.04, Davidson County, Tennessee 47 037 3002
3 +36.2504208 -086.8521501 Census Tract 101.05, Davidson County, Tennessee 47 037 4839
4 +36.2610013 -086.8023491 Census Tract 101.06, Davidson County, Tennessee 47 037 2948
5 +36.2882537 -086.7728157 Census Tract 102.01, Davidson County, Tennessee 47 037 4283
6 +36.3619781 -086.7746355 Census Tract 102.02, Davidson County, Tennessee 47 037 3919
7 +36.3161492 -086.7261435 Census Tract 103.01, Davidson County, Tennessee 47 037 3914
8 +36.3139482 -086.7125964 Census Tract 103.02, Davidson County, Tennessee 47 037 1589
9 +36.3132279 -086.7006728 Census Tract 103.03, Davidson County, Tennessee 47 037 5114
10 +36.2943965 -086.6864670 Census Tract 104.01, Davidson County, Tennessee 47 037 4734
median_income geometry
1 60000 MULTIPOLYGON (((-86.91752 3...
2 84831 MULTIPOLYGON (((-86.9744 36...
3 61115 MULTIPOLYGON (((-86.89144 3...
4 66940 MULTIPOLYGON (((-86.83089 3...
5 69185 MULTIPOLYGON (((-86.81736 3...
6 81695 MULTIPOLYGON (((-86.82483 3...
7 52806 MULTIPOLYGON (((-86.74132 3...
8 50341 MULTIPOLYGON (((-86.72469 3...
9 46604 MULTIPOLYGON (((-86.71971 3...
10 47025 MULTIPOLYGON (((-86.71149 3...
# Plot the merged tract layer on its own.
ggplot(data = census_tracts) +
  geom_sf()
# Keep only the incident points that fall on a census tract; st_filter uses
# st_intersects as its default predicate.
burglary_incidents_mapped_filtered <- burglary_incidents_mapped |>
  st_filter(census_tracts)
# Overlay the retained incidents as small points on top of the tracts.
ggplot(data = census_tracts) +
  geom_sf() +
  geom_sf(data = burglary_incidents_mapped_filtered, size = 0.1)
NA
NA
NA
NA
After performing the spatial join, merge in the census data. Note: Make sure that the final dataset contains all census tracts, even those with zero burglaries.
# Tag every incident point with the attributes of the tract it lies within.
# left = FALSE makes this an inner join: points outside every tract are
# dropped, and tracts without any incident do not appear in this table.
burglary_census_combo <- burglary_incidents_mapped |>
  st_join(census_tracts, join = st_within, left = FALSE)
burglary_census_combo
Simple feature collection with 1142 features and 44 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -87.02 ymin: 35.99 xmax: -86.557 ymax: 36.34
Geodetic CRS: NAD83
Perform some exploratory analysis on your prepared dataset.
Classes of the two datasets:
# Check the classes of the merged tract layer.
class(census_tracts)
[1] "sf" "data.frame"
# Check the classes of the incident/tract join result.
class(burglary_census_combo)
[1] "sf" "tbl_df" "tbl" "data.frame"
Find the burglary with the highest number of victims.
# Show the row(s) whose victim_number equals the largest value in the table;
# missing values are ignored when taking the maximum.
filter(
  burglary_census_combo,
  victim_number == max(victim_number, na.rm = TRUE)
)
Simple feature collection with 1 feature and 44 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -86.69 ymin: 36.15 xmax: -86.69 ymax: 36.15
Geodetic CRS: NAD83
Limit the dataset to one row per incident number by keeping, for each incident, the row with the highest victim number.
# Collapse the joined table to exactly one row per incident so that incidents
# with several victim rows are not double-counted.
#
# For each incident_number the row with the highest victim_number is kept.
# with_ties = FALSE guarantees a single row even when several rows share that
# maximum, and an incident whose victim_number is entirely missing still keeps
# one row instead of being dropped (NA values sort last in slice_max).
# The grouping is removed afterwards so later steps start from an ungrouped
# table, and the result is ordered from most to fewest victims.
real_num_burglaries <- burglary_census_combo |>
  group_by(incident_number) |>
  slice_max(victim_number, n = 1, with_ties = FALSE) |>
  ungroup() |>
  arrange(desc(victim_number))
real_num_burglaries
Simple feature collection with 894 features and 44 fields
Geometry type: POINT
Dimension: XY
Bounding box: xmin: -87.02 ymin: 35.99 xmax: -86.557 ymax: 36.34
Geodetic CRS: NAD83
Calculate the accurate number of burglaries in each tract.
# Count de-duplicated incidents per census tract.
# ungroup() clears any grouping carried over from the de-duplication step so
# that count() tallies by TRACTCE alone.
incident_counts <- real_num_burglaries |>
  st_drop_geometry() |>
  ungroup() |>
  count(TRACTCE, name = "num_burglaries")

# Start from the full list of tracts so that tracts without any burglary are
# kept, with a count of 0 rather than being absent from the table.
# The result has one row per tract (TRACTCE, num_burglaries), ungrouped and
# sorted from most to fewest burglaries.
burglaries_per_tract_real <- census_tracts |>
  st_drop_geometry() |>
  as_tibble() |>
  distinct(TRACTCE) |>
  left_join(incident_counts, by = "TRACTCE") |>
  mutate(num_burglaries = replace_na(num_burglaries, 0L)) |>
  arrange(desc(num_burglaries))
burglaries_per_tract_real
NA
For comparison, count the rows per tract before removing repeated incidents:
# Row-level counts per tract, taken from the joined table before incidents
# are de-duplicated, so an incident with several rows is counted once per row.
# Sorted from the largest count to the smallest.
burglaries_per_tract <- st_drop_geometry(burglary_census_combo) |>
  group_by(TRACTCE) |>
  count(name = "num_burglaries") |>
  arrange(desc(num_burglaries))
burglaries_per_tract
NA
Aggregate the data by census tract. Warning: each incident can appear multiple times if there are multiple victims, so be sure that you aren’t double-counting any incidents.
# Display the per-tract burglary counts computed from the de-duplicated incidents.
burglaries_per_tract_real
NA
Which census tract had the highest number of burglaries?
# zip = 37207
#
# zipcodes |>
# filter(zipcode == zip) |>
# ggplot() +
# geom_sf() +
# geom_sf(data = bus_zips |> filter(zipcode == zip),
# aes(color = `Route Name`))
# Tract code to inspect. TRACTCE holds zero-padded text codes, so the value
# must be a string: the bare literal 016000 is parsed as the number 16000,
# which never equals the text "016000" and would make every filter on it
# return no rows.
tract <- "016000"
# Map the selected tract with its burglaries.
# The tract polygon is drawn first and the incident points second; drawing
# the polygon last would paint its fill over the points and hide them.
# Points are coloured by incident_number.
census_tracts |>
  filter(TRACTCE == tract) |>
  ggplot() +
  geom_sf() +
  geom_sf(
    data = real_num_burglaries |> filter(TRACTCE == tract),
    aes(color = incident_number)
  )
NA
NA
Which census tract had the highest number of burglaries per 1000 residents?
We’re interested in the relationship between median income and number of aggravated burglaries, so examine those variables on their own and together to see what you can find. You may want to perform additional calculations, create plots, etc.